// this file gets called within build_secc.do


{
  // Build household and person identifiers.
  //   hh_id     : string concatenation of AHL block, sub-block and household serial number
  //   person_id : numeric member serial number, made unique within household below

	// force these to string so later string operations work
	// (presumably they import as numeric in some input files -- TODO confirm)
tostring grampanchayatcode govt_pub_sect dwling_rms lgly_ own_oprt highest main kcc, replace
replace grampanchayatcode="9999" if mi(grampanchayatcode)
tostring ahlblockno ahlsubblockno ahlslnohhd, replace

	// NOTE(review): the three parts are concatenated without a separator, so two different
	// (block, sub-block, serial) triples could in principle give the same hh_id unless the
	// parts have fixed widths -- confirm against the raw data
gen hh_id = ahlblockno + ahlsubblockno + ahlslnohhd
sort statecode districtcode tehsilcode grampanchayatcode towncode hh_id slnomember age
egen temp_hh_group = group(statecode districtcode tehsilcode grampanchayatcode towncode hh_id)

	// drop rows that repeat the same household, serial number and age
duplicates drop temp_hh_group slnomember age, force

	// "-" in the serial number is replaced by "0" so the string can be converted to a number
replace slnomember = subinstr(slnomember, "-", "0", .)
destring slnomember, gen(person_id)

	// Members that still share a person_id within a household get a new id.
	// temp_dup_add is 0 for the row that keeps its id and 1, 2, ... for the extras;
	// ordering by (age slnomember) makes the choice reproducible across runs.
bysort temp_hh_group person_id (age slnomember): gen temp_dup_add = _n - 1
egen temp_mem_max = max(person_id), by(temp_hh_group)

	// Extras are numbered consecutively above the household's largest serial number.
	// The previous formula (person_id + max + add) could hand the same new id to two
	// different rows, e.g. serials {1,1,1,2,2} produced 5 twice.
	// Rows with a missing person_id are left missing, as before.
gen byte temp_extra = temp_dup_add > 0 & !mi(person_id)
bysort temp_hh_group (person_id temp_dup_add): gen long temp_extra_seq = sum(temp_extra)
replace person_id = temp_mem_max + temp_extra_seq if temp_extra
*unique temp_hh_group person_id // should be unique (apart from missing person_id)!
drop wardid ahlblockno ahlsubblockno ahlslnohhd slnomember member_status temp* 

	// convert numeric-looking strings to numbers, then switch to short variable names
	// (destring runs first because it refers to the original names)
destring statecode districtcode tehsilcode grampanchayatcode towncode typeofhhd ///
         age dwling_rms totalirr totalunirr otherirr, replace

	// location codes
rename (statecode districtcode tehsilcode grampanchayatcode towncode) ///
       (st_code   dt_code      bk_code    gr_code           tn_code)

	// location names
rename (statename districtname tehsilname townname) ///
       (state     district     block      village)

	// person-level variables
	// (govt_pub_sect goes straight to pub_priv; the intermediate name pub_priv_sect is skipped)
rename (genderid marrital_status educode   govt_pub_sect) ///
       (gender   marital         edu_level pub_priv)

	// dwelling characteristics
rename (dwling_rms ownership_status wall_material roof_material) ///
       (hh_rooms   hh_own           wall_mat      roof_mat)

	// every variable whose name ends in "irrigated" gets the shorter suffix "irr"
rename *irrigated *irr

	// assets and household-level flags
rename (own_telephone_mobile_phone irrigation_equipment own_motorized_wheeler own_refrigerator) ///
       (own_phone                  irr_equip            own_motor_veh         own_frig)
rename (lgly_rlsd_bnd_lbr kcc_wt_cr_lt_of_at_50k) ///
       (bonded_labor      credit_card)


	// variable-by-variable string shortening (LOTS of file-size reduction happening here!)
	// Each long category text is replaced by a short code; values not listed are left untouched.

	// gender and phone ownership: keep the first character only
replace gender = substr(gender, 1, 1)
replace own_phone = substr(own_phone, 1, 1)

	// caste group: "no caste/tribe" becomes blank, "other" becomes OT
replace caste_group = cond(caste_group == "NO CASTE/TRIBE", "", ///
                      cond(caste_group == "OTHER", "OT", caste_group))

	// marital status -> one-letter code
local short_codes M D N S W
local i = 0
foreach long_text in "CURRENTLY MARRIED" "DIVORCED" "NEVER MARRIED" "SEPARATED" "WIDOWED" {
	local ++i
	local code : word `i' of `short_codes'
	replace marital = "`code'" if marital == "`long_text'"
}

	// public/private sector: first two characters, with "0" recoded to NA
replace pub_priv = cond(substr(pub_priv, 1, 2) == "0", "NA", substr(pub_priv, 1, 2))

	// house ownership -> one-letter code
local short_codes O R N
local i = 0
foreach long_text in "OWNED" "RENTED" "ANY OTHER" {
	local ++i
	local code : word `i' of `short_codes'
	replace hh_own = "`code'" if hh_own == "`long_text'"
}

	// wall and roof material: codes shared by both variables
	// (both spellings of the plastic category map to PLP)
local short_codes OTH BBK CON GMA PLP PLP
foreach mat of varlist wall_mat roof_mat {
	local i = 0
	foreach long_text in "ANY OTHER" "BURNT BRICK" "CONCRETE" "GI/METAL/ASBESTOS SHEETS" ///
	                     "PLASTIC/POLOTHENE" "PLASTIC/POLYTHENE" {
		local ++i
		local code : word `i' of `short_codes'
		replace `mat' = "`code'" if `mat' == "`long_text'"
	}
}

	// wall-only categories
local short_codes GTB MUB STN STM WOO
local i = 0
foreach long_text in "GRASS/THATCH/BAMBOO" "MUD/UNBURNT BRICK" "STONE NOT PACKED WITH MORTAR" ///
                     "STONE PACKED WITH MORTAR" "WOOD" {
	local ++i
	local code : word `i' of `short_codes'
	replace wall_mat = "`code'" if wall_mat == "`long_text'"
}

	// roof-only categories
local short_codes GTB HMT MMT SLA STO
local i = 0
foreach long_text in "GRASS/THATCH/BAMBOO/WOOD/MUD" "HAND MADE TILE" "MACHINE MADE TILE" ///
                     "SLATE" "STONE" {
	local ++i
	local code : word `i' of `short_codes'
	replace roof_mat = "`code'" if roof_mat == "`long_text'"
}

	// education: first two characters, then BE -> "<P" and HI -> "HS"
	// (HS = higher secondary, not "more than secondary")
replace edu_level = substr(edu_level, 1, 2)
replace edu_level = cond(edu_level == "BE", "<P", "HS") if inlist(edu_level, "BE", "HI")

	// highest monthly income: turn the income-bracket wording into symbols
	// (the order of the substitutions is the same as it has always been)
replace highest_mnth_inc = subinstr(highest_mnth_inc, "BETWEEN", "", .)
foreach phrase in "LESS THAN" "UNDER" {
	replace highest_mnth_inc = subinstr(highest_mnth_inc, "`phrase'", "<", .)
}
foreach phrase in "GREATER THAN" "MORE THAN" "OVER" {
	replace highest_mnth_inc = subinstr(highest_mnth_inc, "`phrase'", ">", .)
}
replace highest_mnth_inc = subinstr(highest_mnth_inc, "AND", "-", .)
	// finally remove every "K" and then every blank
replace highest_mnth_inc = subinstr(subinstr(highest_mnth_inc, "K", "", .), " ", "", .)

	// main source of household income -> three-letter code
local short_codes CUL FOR LAB ENT OTH DOM
local i = 0
foreach long_text in "CULTIVATION" "FORAGING/RAG PICKING" "MANUAL CASUAL LABOUR" ///
                     "NON AGRICULTURE OWN ENTERPRISE" "OTHERS" "PART/FULL TIME DOMESTIC SERVICE" {
	local ++i
	local code : word `i' of `short_codes'
	replace main_src_of_hh_inc = "`code'" if main_src_of_hh_inc == "`long_text'"
}

	// convert yes/no variables to 1/0
	// YES/NO are matched case-insensitively; any other value is left as it is, and destring
	// then only converts the variable if everything in it is numeric
foreach v of varlist salaried_job own_oprt_ent_reg_wt_gvt own_frig own_motor_veh ///
             mech_3_4_wheeler_agr_eqp irr_equip own_any_land ///
             bonded_labor manual_scavenger credit_card {
	replace `v' = cond(upper(`v') == "YES", "1", cond(upper(`v') == "NO", "0", `v'))
	destring `v', replace
}

	// trim strings and drop always-missing variables
foreach v of varlist * {
	// string variables: collapse internal blanks, strip the ends, convert to upper case
	capture confirm string variable `v'
	if _rc == 0 {
		quietly replace `v' = upper(trim(itrim(`v')))
	}
	// a variable with no non-missing observation carries no information
	count if !mi(`v')
	if r(N) == 0 {
		drop `v'
	}
}

	// shorten state name to 3 characters (Uttar Pradesh gets the special code UP)
	// NOTE(review): confirm that no two state names in the data share their first three letters
replace state = cond(state == "UTTAR PRADESH", "UP", substr(state, 1, 3))

**** CLEAN OCCUPATION
* Builds occ_cleaned, a coarse occupation category, in these steps:
*   1. strip digits and punctuation from the free-text occupation and lower-case it
*   2. group similar spellings (strgroup) and use the most frequent spelling of each group
*   3. map a few keyword families to fixed categories and keep only the first word
*   4. group the result again and use the category of the most frequent spelling per group
* The lookup (occupation -> occ_cleaned) is saved to temp_occupation.dta and merged back.
{
* basic cleaning stuff
forvalues i = 0/9 {
  replace occupation = subinstr(occupation, "`i'", "", .)
}
replace occupation = subinstr(occupation, "-", "", .)
replace occupation = subinstr(occupation, `"""', "", .)
replace occupation = subinstr(occupation, "'", "", .)
replace occupation = subinstr(occupation, ",", "", .)
replace occupation = subinstr(occupation, ".", "", .)
replace occupation = subinstr(occupation, "\", "", .)
replace occupation = subinstr(occupation, ";", "", .)
replace occupation = subinstr(occupation, "/", "", .)
replace occupation = subinstr(occupation, "_", "", .)
replace occupation = itrim(occupation)
replace occupation = trim(occupation)
replace occupation = lower(occupation)

** take it out to clean further
preserve
* one row per distinct occupation string; dupes = number of additional rows with that string
duplicates tag occupation, gen(dupes)
keep occupation dupes
duplicates drop 
sort occupation

* group occupation by strings
strgroup occupation, gen(occgroup) threshold(0.25) normalize(shorter) noclean

** make a "cleaned" occupation that's the most-common version in the group
* neg_dupes puts the most frequent spelling first; occupation breaks ties so that the
* result is the same on every run
gen long neg_dupes = -dupes
bysort occgroup (neg_dupes occupation): gen occ_cleaned = occupation[1]

keep occupation occ_cleaned dupes neg_dupes

* do some further cleaning
* (the order matters: an earlier match replaces the text that later patterns are tested on)
replace occ_cleaned = "agriculture" if regexm(occ_cleaned, "agri") == 1 | /// 
  regexm(occ_cleaned, "animal") == 1 | regexm(occ_cleaned, "farm") == 1 | ///
  regexm(occ_cleaned, "cultiv") == 1
replace occ_cleaned = "student" if regexm(occ_cleaned, "stud") == 1
replace occ_cleaned = "dependent" if regexm(occ_cleaned, "depe") == 1
replace occ_cleaned = "shopkeep" if regexm(occ_cleaned, "shop") == 1
replace occ_cleaned = "retired" if regexm(occ_cleaned, "retir") == 1
replace occ_cleaned = "auto" if regexm(occ_cleaned, "driver") == 1 | ///
  regexm(occ_cleaned, "auto")
replace occ_cleaned = "house" if regexm(occ_cleaned, "house") == 1
replace occ_cleaned = "child" if regexm(occ_cleaned, "child") == 1
replace occ_cleaned = "tailor" if regexm(occ_cleaned, "tailor") == 1
replace occ_cleaned = "worker" if regexm(occ_cleaned, "worker") == 1 | ///
   regexm(occ_cleaned, "labor") == 1 |  regexm(occ_cleaned, "labour") == 1 

* keep only the first word
split occ_cleaned
replace occ_cleaned =  occ_cleaned1 
drop occ_cleaned?

* group again
strgroup occ_cleaned, gen(occgroup2) threshold(0.4) normalize(shorter) noclean

* replace again: every spelling in a second-round group gets the category of the group's
* most frequent spelling. This is grouped by occgroup2 explicitly; the earlier
* "by occgroup:" only worked because Stata expanded the abbreviation to occgroup2, and
* it would stop with an error under "set varabbrev off".
bysort occgroup2 (neg_dupes occupation): gen occ_cleaned2 = occ_cleaned[1]

* make this the "cleaned" occupation variable
keep occupation occ_cleaned2
rename occ_cleaned2 occ_cleaned

save "$secc_raw/`f1'/`f2'/`f3'/temp_occupation.dta", replace

restore
* merge the cleaned version in
merge m:1 occupation using "$secc_raw/`f1'/`f2'/`f3'/temp_occupation.dta", nogen
* shorten the occupation string we're keeping for space
replace occupation = substr(occupation, 1, 15)
}
*****

  // clean relation (very cut and dry for now):
  // keep only a 1/0 flag for rows whose relation is exactly "HEAD"
gen hh_head = 0
replace hh_head = 1 if relation == "HEAD"
drop relation

  // labels
  // Each label is captured on its own: a variable can be absent (always-missing variables
  // are dropped earlier), and a single capture block around all labels would stop at the
  // first absent variable and silently leave every later variable unlabelled.
cap la var st_code "2001 state code"
cap la var dt_code "District code (2001)"
cap la var bk_code "Block code (2001)"
cap la var gr_code "Grampanchayat code"
cap la var tn_code "Town code (not sure if this maps to any other datasets)"
cap la var village "Village name"
cap la var hh_id "Household id (unique within village)"
cap la var person_id "Person id (unique within household)"
cap la var gender "Gender"
cap la var age "Age"
cap la var marital "Marital status (S=separated, N=never married)"
cap la var edu_level "Education level (illiterate, primary, middle, secondary, grad)"
cap la var hh_head "1/0 for household head (not perfectly coded)"
cap la var caste_group "SC/ST or Other or No Group (blank)"
cap la var wall_mat "Wall material (Mud/Unburnt Brick; Stone+Mortar; Grass/Thatch/Bamboo)"
cap la var roof_mat "Roof material (Grass/Thatch/Bamboo/wood/mud; Stone; Slate; HandMade Tile)"
cap la var hh_own "House is Owned/Rented/Neither"
cap la var hh_rooms "Dwelling rooms in house"
cap la var salaried_job "1/0 for holding salaried job"
cap la var pub_priv "Is salaried job Public or Private?"
cap la var highest_mnth_inc "Highest monthly income for household (Rupees)"
cap la var main_src_of_hh_inc "Main source of household income"
cap la var own_frig "1/0 own refrigerator"
cap la var own_phone "1/0 own phone (Mobile, None)"
cap la var own_motor_veh "1/0 own motorized wheeler"
  // full variable name instead of the abbreviation mech_3_4, so this does not depend on varabbrev
cap la var mech_3_4_wheeler_agr_eqp "1/0 own mechanized wheeler (agr equip)"
cap la var irr_equip "1/0 own irrigation equipment"
cap la var own_any_land "1/0 own any land"
cap la var totalirr "Total land irrigirated: 2 crops(acres)"
cap la var totalunirr "Total land unirrigated (acres)"
cap la var otherirr "Other land irrigated (acres)"
cap la var occupation "Occupation"
cap la var occ_cleaned "Occupation (cleaned)"
cap la var credit_card "1/0 has Kisan CC w 50k+ limit"
cap la var bonded_labor "1/0 any hh member is in bonded labor"
cap la var manual_scavenger "1/0 any hh member is a scavenger"
  // package for final outputs
  // expand the st_code-village range once, so that ordering and sorting use the same list
unab secc_geo_vars : st_code-village
local secc_key_vars `secc_geo_vars' hh_id person_id

  // identifiers first, then the main person-level variables; rows sorted by the identifiers
order `secc_key_vars' gender age marital edu_level hh_head caste_group
sort `secc_key_vars'

  // store every variable in the smallest type that holds its values
compress
}
